In [6]:
# libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly_express as px

# importing all needed files

pop = pd.read_excel("/Users/silas/Documents/College/Spring 2022/Anthro Homework Research/Homework 5/Pop edited.xlsx")
poll_df = pd.ExcelFile("/Users/silas/Documents/College/Spring 2022/Anthro Homework Research/Homework 5/Pollution.xlsx")
poll_unorder = pd.read_excel(poll_df, 'Main', usecols=[*range(0,3)])
poll = poll_unorder.sort_values(by=['Year', 'State'])
income = pd.read_excel("/Users/silas/Documents/College/Spring 2022/Anthro Homework Research/Homework 5/Income.xlsx")
suicide = pd.read_excel("/Users/silas/Documents/College/Spring 2022/Anthro Homework Research/Homework 5/Suicide.xlsx")
In [7]:
# merging population, income, CO pollution

merge1 = pd.merge(pop, income, on=['Year', 'State'])
merge1
pop_poll = pd.merge(merge1, poll, on=['Year', 'State'])
pop_poll
Out[7]:
State Year Pop Income CO
0 AL 2000 4452339 51798 1435.409350
1 AK 2000 627500 77275 154.916970
2 AZ 2000 5165993 58172 1037.815130
3 AR 2000 2678511 43424 765.290420
4 CA 2000 34002756 68456 5828.319510
... ... ... ... ... ...
1045 VA 2020 8590563 81947 480.646957
1046 WA 2020 7693612 81083 429.406833
1047 WV 2020 1784787 51615 114.621768
1048 WI 2020 5832655 67094 321.539561
1049 WY 2020 582328 65108 61.977679

1050 rows × 5 columns

In [8]:
# merging population, income, suicide

pop_suicide = pd.merge(merge1, suicide, on=['Year', 'State'])
pop_suicide
Out[8]:
State Year Pop Income Suicide
0 AL 2005 4557808 47885 11.5
1 AK 2005 663661 72041 19.9
2 AZ 2005 5939292 58319 16.4
3 AR 2005 2779154 47250 14.3
4 CA 2005 36132147 66710 9.1
... ... ... ... ... ...
395 VA 2020 8590563 81947 13.5
396 WA 2020 7693612 81083 15.2
397 WV 2020 1784787 51615 19.4
398 WI 2020 5832655 67094 14.5
399 WY 2020 582328 65108 30.5

400 rows × 5 columns

In [9]:
# merging all variables

df_all = pd.merge(pop_poll, suicide, on=['Year', 'State'])
df_all
Out[9]:
State Year Pop Income CO Suicide
0 AL 2005 4557808 47885 827.544647 11.5
1 AK 2005 663661 72041 133.547307 19.9
2 AZ 2005 5939292 58319 697.087586 16.4
3 AR 2005 2779154 47250 522.034956 14.3
4 CA 2005 36132147 66710 2810.511599 9.1
... ... ... ... ... ... ...
395 VA 2020 8590563 81947 480.646957 13.5
396 WA 2020 7693612 81083 429.406833 15.2
397 WV 2020 1784787 51615 114.621768 19.4
398 WI 2020 5832655 67094 321.539561 14.5
399 WY 2020 582328 65108 61.977679 30.5

400 rows × 6 columns

In [10]:
# animation from 2000 to 2020 for income v CO pollution

px.scatter(data_frame=pop_poll, x="Income", y="CO", animation_frame="Year", animation_group="State",
          size="Pop", color="State", hover_name="State", title="Income VS CO Pollution", size_max=40)
In [11]:
# animation from 2004, 2014-2020 comparing imcome with suicide over time

px.scatter(data_frame=pop_suicide, x="Income", y="Suicide", animation_frame="Year", animation_group="State",
           size="Pop", color="State", hover_name="State", range_x=[40000,90000], range_y=[0,30], 
           title="Income VS Suicide", size_max=40)
In [12]:
# this was somewhat of an experimental animation, tried to find a way to compare all 6 varibles 
# (State, Year, Pollution, Suicide, Income, and Population) and it worked out better than I expected

px.scatter(df_all, x="CO", y="Suicide", animation_frame="Year", animation_group="Income",
          size="Pop", color="Income", hover_name='State', range_x=[0, 1500],range_y=[5, 30],
           title="CO Pollution VS Suicide", size_max=40)